原始数据

导入

In [218]:
# df = pd.read_excel(r'../ML/ML计算/特征过滤法.xlsx',sheet_name='0')

# df=pd.read_csv(r'small_loan.csv')
# df.drop('id', inplace=True, axis=1)

df = pd.read_csv(r'../01ML_Case/house_price/train.csv')
df.drop('Id', inplace=True, axis=1)

df.shape
Out[218]:
(1460, 80)
In [219]:
df.head()
Out[219]:
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2003 2003 Gable CompShg VinylSd VinylSd BrkFace 196.0 Gd TA PConc Gd TA No GLQ 706 Unf 0 150 856 GasA Ex Y SBrkr 856 854 0 1710 1 0 2 1 3 1 Gd 8 Typ 0 NaN Attchd 2003.0 RFn 2 548 TA TA Y 0 61 0 0 0 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub FR2 Gtl Veenker Feedr Norm 1Fam 1Story 6 8 1976 1976 Gable CompShg MetalSd MetalSd None 0.0 TA TA CBlock Gd TA Gd ALQ 978 Unf 0 284 1262 GasA Ex Y SBrkr 1262 0 0 1262 0 1 2 0 3 1 TA 6 Typ 1 TA Attchd 1976.0 RFn 2 460 TA TA Y 298 0 0 0 0 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2001 2002 Gable CompShg VinylSd VinylSd BrkFace 162.0 Gd TA PConc Gd TA Mn GLQ 486 Unf 0 434 920 GasA Ex Y SBrkr 920 866 0 1786 1 0 2 1 3 1 Gd 6 Typ 1 TA Attchd 2001.0 RFn 2 608 TA TA Y 0 42 0 0 0 0 NaN NaN NaN 0 9 2008 WD Normal 223500
3 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub Corner Gtl Crawfor Norm Norm 1Fam 2Story 7 5 1915 1970 Gable CompShg Wd Sdng Wd Shng None 0.0 TA TA BrkTil TA Gd No ALQ 216 Unf 0 540 756 GasA Gd Y SBrkr 961 756 0 1717 1 0 1 0 3 1 Gd 7 Typ 1 Gd Detchd 1998.0 Unf 3 642 TA TA Y 0 35 272 0 0 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
4 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub FR2 Gtl NoRidge Norm Norm 1Fam 2Story 8 5 2000 2000 Gable CompShg VinylSd VinylSd BrkFace 350.0 Gd TA PConc Gd TA Av GLQ 655 Unf 0 490 1145 GasA Ex Y SBrkr 1145 1053 0 2198 1 0 2 1 4 1 Gd 9 Typ 1 TA Attchd 2000.0 RFn 3 836 TA TA Y 192 84 0 0 0 0 NaN NaN NaN 0 12 2008 WD Normal 250000

report 1st

In [220]:
dqrs=DQReport(data=df, target='Y').SReport()
dqrs
# pde.dftohtml(dqrs)
Out[220]:
col_name dtype kinds null 空值率 nunique unique_value
39 HeatingQC object categorical 0 0.0% 5 [Po, Fa, Gd, TA, Ex]
29 BsmtQual object categorical 37 2.5% 4 [Fa, Ex, Gd, TA]
59 GarageFinish object categorical 81 5.5% 3 [Fin, RFn, Unf]
31 BsmtExposure object categorical 38 2.6% 4 [Mn, Gd, Av, No]
32 BsmtFinType1 object categorical 37 2.5% 6 [LwQ, Rec, BLQ, ALQ, GLQ, Unf]
... ... ... ... ... ... ... ...
43 2ndFlrSF int64 numeric 0 0.0% 417 [430, 892, 1028, 1031, 1037, 1038, 883, 882]
44 LowQualFinSF int64 numeric 0 0.0% 24 [371, 514, 513, 481, 479, 473, 420, 397]
45 GrLivArea int64 numeric 0 0.0% 861 [2230, 1803, 1764, 1797, 1787, 1784, 1775, 1771]
58 GarageYrBlt float64 numeric 81 5.5% 97 [1906.0, 1908.0, 1900.0, 1933.0, 1927.0, 1942....
79 SalePrice int64 numeric 0 0.0% 663 [372500, 90350, 119200, 176432, 375000, 381000...

80 rows × 7 columns

In [221]:
report_view(dqrs)
dfc_str,dfc_num,dfc_num_int,dfc_num_float=dfc_type(df,if_print=True)
str maybe num,  dtype: str, kind: num,  0
[]

str nunique>=8,: 8
['SaleType', 'Neighborhood', 'Condition1', 'Condition2', 'Exterior2nd', 'Exterior1st', 'RoofMatl', 'HouseStyle']

num maybe str,  dtype: num, kind: str,  10
['PoolArea', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces', 'BsmtFullBath', 'YrSold', 'GarageCars']

num nunique<=8,: 10
['PoolArea', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces', 'BsmtFullBath', 'YrSold', 'GarageCars']

num nunique>8,<=20,: 6
['MoSold', '3SsnPorch', 'MSSubClass', 'TotRmsAbvGrd', 'OverallQual', 'OverallCond']
****************************** str, 43
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
****************************** int, 34
['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice']
****************************** float, 3
['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

初始处理

ytype

In [222]:
# ytype='fenlei'#分类
ytype='huigui'#回归

Y (类型记得转化)

In [223]:
df.rename(columns={'SalePrice':'Y'},inplace=True)
# df.rename(columns={'response':'Y'},inplace=True)
# df['Y'] = df['Y'].map({"YES": 1, "NO": 0})
In [224]:
if ytype=='fenlei':
    df['Y']=df['Y'].astype(int)
else:
    df['Y']=df['Y'].astype(float)    
In [225]:
if ytype=='fenlei':
    # 分类
    dfc_str_col_plt_count(df,'Y')
else:
    # 回归
    print("Skewness: %f" % df['Y'].skew())
    print("Kurtosis: %f" % df['Y'].kurt())
    dfc_num_plt_huigui('Y',df)
    dfc_num_col_plt_value(df)
    
    # Ycut
    df['Ycut']=pd.qcut(df.Y, 4)
    df['Ycut']=df['Ycut'].astype(str)
    print(df.Ycut.value_counts())
Skewness: 1.882876
Kurtosis: 6.536282
(129975.0, 163000.0]     367
(163000.0, 214000.0]     366
(34899.999, 129975.0]    365
(214000.0, 755000.0]     362
Name: Ycut, dtype: int64

wvf符号替换为nan

In [226]:
df = wrong_value_fillna(wrong_value=list(wvf_txt())).fit_transform(df)
错误值替换为nan,共0个

-----column type转换

In [227]:
# df['age']=df['age'].astype(float)
# df['income']=df['income'].astype(float)
# df['children']=df['children'].astype(str)
# df_fill_na['age']=df_fill_na['age'].astype(float)
# df_fill_na['income']=df_fill_na['income'].astype(float)
In [228]:
# float可能为int的
dfc_type_mcol_float_may_int(dfc_num_float,df)
Out[228]:
[True, True, True]
In [229]:
dfc_str,dfc_num,dfc_num_int,dfc_num_float=dfc_type(df,if_print=True)
****************************** str, 43
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
****************************** int, 33
['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
****************************** float, 3
['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

-----手动

In [230]:
# df['children']=df['children'].astype(str)

空值填充 df_fill_na

xgb_fill

In [231]:
# df_fill_na=xgb_fill().fit_transform(df.drop(columns=['Y']),df.Y) #无Y列出错

-----其他fill

In [232]:
# df['age'].fillna(df['age'].mode()[0],inplace=True)
# df['age']=df['age'].astype(int)

null列分析

In [233]:
dfc_null(df,0.05)
Out[233]:
total null rate
PoolQC 1453 0.995205
MiscFeature 1406 0.963014
Alley 1369 0.937671
Fence 1179 0.807534
FireplaceQu 690 0.472603
LotFrontage 259 0.177397
GarageYrBlt 81 0.055479
GarageType 81 0.055479
GarageQual 81 0.055479
GarageCond 81 0.055479
GarageFinish 81 0.055479

-----null列删除

In [234]:
df.drop(columns=dfc_null(df,0.2).index,inplace=True)
df.shape
Out[234]:
(1460, 76)

null列再次查看

In [235]:
df.isnull().sum()[df.isnull().sum()>0]
Out[235]:
LotFrontage     259
MasVnrType        8
MasVnrArea        8
BsmtQual         37
BsmtCond         37
BsmtExposure     38
BsmtFinType1     37
BsmtFinType2     38
Electrical        1
GarageType       81
GarageYrBlt      81
GarageFinish     81
GarageQual       81
GarageCond       81
dtype: int64

report 2nd

In [236]:
dqrs=DQReport(data=df, target='Y').SReport()
dqrs
# pde.dftohtml(dqrs)
Out[236]:
col_name dtype kinds null 空值率 nunique unique_value
37 Heating object categorical 0 0.0% 6 [Floor, OthW, Wall, Grav, GasW, GasA]
30 BsmtExposure object categorical 38 2.6% 4 [Mn, Gd, Av, No]
31 BsmtFinType1 object categorical 37 2.5% 6 [LwQ, Rec, BLQ, ALQ, GLQ, Unf]
33 BsmtFinType2 object categorical 38 2.6% 6 [GLQ, ALQ, BLQ, LwQ, Rec, Unf]
38 HeatingQC object categorical 0 0.0% 5 [Po, Fa, Gd, TA, Ex]
... ... ... ... ... ... ... ...
34 BsmtFinSF2 int64 numeric 0 0.0% 144 [169, 692, 713, 764, 352, 768, 791, 690]
32 BsmtFinSF1 int64 numeric 0 0.0% 637 [404, 222, 223, 224, 1441, 228, 1440, 1436]
24 MasVnrArea float64 numeric 8 0.5% 327 [67.0, 506.0, 459.0, 748.0, 603.0, 46.0, 922.0...
56 GarageYrBlt float64 numeric 81 5.5% 97 [1906.0, 1908.0, 1900.0, 1933.0, 1927.0, 1942....
0 MSSubClass int64 numeric 0 0.0% 15 [40, 180, 45, 75, 85, 190, 90, 80]

76 rows × 7 columns

最终确认dfc list

In [237]:
report_view(dqrs)
dfc_str,dfc_num,dfc_num_int,dfc_num_float=dfc_type(df,if_print=True)
str maybe num,  dtype: str, kind: num,  0
[]

str nunique>=8,: 8
['SaleType', 'Neighborhood', 'Condition1', 'Condition2', 'HouseStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd']

num maybe str,  dtype: num, kind: str,  10
['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces', 'GarageCars', 'PoolArea', 'YrSold']

num nunique<=8,: 10
['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces', 'GarageCars', 'PoolArea', 'YrSold']

num nunique>8,<=20,: 6
['3SsnPorch', 'MoSold', 'TotRmsAbvGrd', 'OverallQual', 'OverallCond', 'MSSubClass']
****************************** str, 38
['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']
****************************** int, 33
['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
****************************** float, 3
['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

异常值

发现

In [238]:
abnormal_value(df,dfc_num,dqrs)#print_value_list:bool=False
     col_name  dtype    kinds  null   空值率  nunique
0  MSSubClass  int64  numeric     0  0.0%       15
num index: * 0 * [[40, 180, 45, 75, 85, 190, 90, 80]]

quartile  非重复总数:15   异常值个数:103  比例:0.07054794520547945 !!!!!!!!!! 异常值过多,超过非重复总数

+/-3 std  非重复总数:15   异常值个数:30   比例:0.02054794520547945 !!!!!!!!!! 异常值过多,超过非重复总数
      col_name    dtype    kinds  null    空值率  nunique
2  LotFrontage  float64  numeric   259  17.7%      110
num index: * 2 * [[137.0, 150.0, 111.0, 153.0, 182.0, 46.0, 112.0, 149.0]]

quartile  非重复总数:110  异常值个数:88   比例:0.06027397260273973

+/-3 std  非重复总数:110  异常值个数:12   比例:0.00821917808219178
  col_name  dtype    kinds  null   空值率  nunique
3  LotArea  int64  numeric     0  0.0%     1073
num index: * 3 * [[8123, 9590, 9588, 9587, 13682, 21872, 7535, 12384]]

quartile  非重复总数:1073 异常值个数:69   比例:0.04726027397260274

+/-3 std  非重复总数:1073 异常值个数:13   比例:0.008904109589041096
       col_name  dtype    kinds  null   空值率  nunique
15  OverallQual  int64  numeric     0  0.0%       10
num index: * 15 * [[1, 2, 10, 3, 9, 4, 8, 7]]

quartile  非重复总数:10   异常值个数:2    比例:0.0013698630136986301

+/-3 std  非重复总数:10   异常值个数:2    比例:0.0013698630136986301
       col_name  dtype    kinds  null   空值率  nunique
16  OverallCond  int64  numeric     0  0.0%        9
num index: * 16 * [[1, 2, 9, 3, 4, 8, 7, 6]]

quartile  非重复总数:9    异常值个数:125  比例:0.08561643835616438 !!!!!!!!!! 异常值过多,超过非重复总数

+/-3 std  非重复总数:9    异常值个数:28   比例:0.019178082191780823 !!!!!!!!!! 异常值过多,超过非重复总数
     col_name  dtype    kinds  null   空值率  nunique
17  YearBuilt  int64  numeric     0  0.0%      112
num index: * 17 * [[1872, 1882, 1875, 1893, 2010, 1898, 1917, 1905]]

quartile  非重复总数:112  异常值个数:7    比例:0.004794520547945206

+/-3 std  非重复总数:112  异常值个数:6    比例:0.00410958904109589
        col_name  dtype    kinds  null   空值率  nunique
18  YearRemodAdd  int64  numeric     0  0.0%       61
num index: * 18 * [[1951, 1983, 1986, 1952, 2010, 1984, 1982, 1974]]


      col_name    dtype    kinds  null   空值率  nunique
24  MasVnrArea  float64  numeric     8  0.5%      327
num index: * 24 * [[67.0, 506.0, 459.0, 748.0, 603.0, 46.0, 922.0, 315.0]]

quartile  非重复总数:327  异常值个数:96   比例:0.06575342465753424

+/-3 std  非重复总数:327  异常值个数:32   比例:0.021917808219178082
      col_name  dtype    kinds  null   空值率  nunique
32  BsmtFinSF1  int64  numeric     0  0.0%      637
num index: * 32 * [[404, 222, 223, 224, 1441, 228, 1440, 1436]]

quartile  非重复总数:637  异常值个数:7    比例:0.004794520547945206

+/-3 std  非重复总数:637  异常值个数:6    比例:0.00410958904109589
      col_name  dtype    kinds  null   空值率  nunique
34  BsmtFinSF2  int64  numeric     0  0.0%      144
num index: * 34 * [[169, 692, 713, 764, 352, 768, 791, 690]]

quartile  非重复总数:144  异常值个数:167  比例:0.11438356164383562 !!!!!!!!!! 异常值过多,超过非重复总数

+/-3 std  非重复总数:144  异常值个数:50   比例:0.03424657534246575
     col_name  dtype    kinds  null   空值率  nunique
35  BsmtUnfSF  int64  numeric     0  0.0%      780
num index: * 35 * [[1777, 1085, 1082, 1079, 1078, 1073, 1045, 1072]]

quartile  非重复总数:780  异常值个数:29   比例:0.01986301369863014

+/-3 std  非重复总数:780  异常值个数:11   比例:0.007534246575342466
       col_name  dtype    kinds  null   空值率  nunique
36  TotalBsmtSF  int64  numeric     0  0.0%      721
num index: * 36 * [[1614, 1766, 1842, 662, 1926, 661, 1922, 602]]

quartile  非重复总数:721  异常值个数:61   比例:0.04178082191780822

+/-3 std  非重复总数:721  异常值个数:10   比例:0.00684931506849315
    col_name  dtype    kinds  null   空值率  nunique
41  1stFlrSF  int64  numeric     0  0.0%      753
num index: * 41 * [[969, 1576, 1578, 1582, 1586, 1593, 1604, 1554]]

quartile  非重复总数:753  异常值个数:20   比例:0.0136986301369863

+/-3 std  非重复总数:753  异常值个数:12   比例:0.00821917808219178
    col_name  dtype    kinds  null   空值率  nunique
42  2ndFlrSF  int64  numeric     0  0.0%      417
num index: * 42 * [[430, 892, 1028, 1031, 1037, 1038, 883, 882]]

quartile  非重复总数:417  异常值个数:2    比例:0.0013698630136986301

+/-3 std  非重复总数:417  异常值个数:4    比例:0.0027397260273972603
        col_name  dtype    kinds  null   空值率  nunique
43  LowQualFinSF  int64  numeric     0  0.0%       24
num index: * 43 * [[371, 514, 513, 481, 479, 473, 420, 397]]

quartile  非重复总数:24   异常值个数:26   比例:0.01780821917808219 !!!!!!!!!! 异常值过多,超过非重复总数

+/-3 std  非重复总数:24   异常值个数:20   比例:0.0136986301369863
     col_name  dtype    kinds  null   空值率  nunique
44  GrLivArea  int64  numeric     0  0.0%      861
num index: * 44 * [[2230, 1803, 1764, 1797, 1787, 1784, 1775, 1771]]

quartile  非重复总数:861  异常值个数:31   比例:0.021232876712328767

+/-3 std  非重复总数:861  异常值个数:16   比例:0.010958904109589041
        col_name  dtype        kinds  null   空值率  nunique
45  BsmtFullBath  int64  categorical     0  0.0%        4
num index: * 45 * [[3, 2, 1, 0]]

quartile  非重复总数:4    异常值个数:1    比例:0.0006849315068493151

+/-3 std  非重复总数:4    异常值个数:16   比例:0.010958904109589041 !!!!!!!!!! 异常值过多,超过非重复总数
        col_name  dtype        kinds  null   空值率  nunique
46  BsmtHalfBath  int64  categorical     0  0.0%        3
num index: * 46 * [[2, 1, 0]]

quartile  非重复总数:3    异常值个数:82   比例:0.056164383561643834 !!!!!!!!!! 异常值过多,超过非重复总数

+/-3 std  非重复总数:3    异常值个数:82   比例:0.056164383561643834 !!!!!!!!!! 异常值过多,超过非重复总数
    col_name  dtype        kinds  null   空值率  nunique
47  FullBath  int64  categorical     0  0.0%        4
num index: * 47 * [[0, 3, 1, 2]]


    col_name  dtype        kinds  null   空值率  nunique
48  HalfBath  int64  categorical     0  0.0%        3
num index: * 48 * [[2, 1, 0]]


+/-3 std  非重复总数:3    异常值个数:12   比例:0.00821917808219178 !!!!!!!!!! 异常值过多,超过非重复总数
        col_name  dtype        kinds  null   空值率  nunique
49  BedroomAbvGr  int64  categorical     0  0.0%        8
num index: * 49 * [[8, 0, 6, 5, 1, 4, 2, 3]]

quartile  非重复总数:8    异常值个数:35   比例:0.023972602739726026 !!!!!!!!!! 异常值过多,超过非重复总数

+/-3 std  非重复总数:8    异常值个数:14   比例:0.009589041095890411 !!!!!!!!!! 异常值过多,超过非重复总数
        col_name  dtype        kinds  null   空值率  nunique
50  KitchenAbvGr  int64  categorical     0  0.0%        4
num index: * 50 * [[0, 3, 2, 1]]

quartile  非重复总数:4    异常值个数:68   比例:0.04657534246575343 !!!!!!!!!! 异常值过多,超过非重复总数

+/-3 std  非重复总数:4    异常值个数:68   比例:0.04657534246575343 !!!!!!!!!! 异常值过多,超过非重复总数
        col_name  dtype    kinds  null   空值率  nunique
52  TotRmsAbvGrd  int64  numeric     0  0.0%       12
num index: * 52 * [[14, 2, 12, 3, 11, 10, 9, 4]]

quartile  非重复总数:12   异常值个数:30   比例:0.02054794520547945 !!!!!!!!!! 异常值过多,超过非重复总数

+/-3 std  非重复总数:12   异常值个数:12   比例:0.00821917808219178 !!!!!!!!!! 异常值过多,超过非重复总数
      col_name  dtype        kinds  null   空值率  nunique
54  Fireplaces  int64  categorical     0  0.0%        4
num index: * 54 * [[3, 2, 1, 0]]

quartile  非重复总数:4    异常值个数:5    比例:0.003424657534246575 !!!!!!!!!! 异常值过多,超过非重复总数

+/-3 std  非重复总数:4    异常值个数:5    比例:0.003424657534246575 !!!!!!!!!! 异常值过多,超过非重复总数
       col_name    dtype    kinds  null   空值率  nunique
56  GarageYrBlt  float64  numeric    81  5.5%       97
num index: * 56 * [[1906.0, 1908.0, 1900.0, 1933.0, 1927.0, 1942.0, 1915.0, 1914.0]]


+/-3 std  非重复总数:97   异常值个数:1    比例:0.0006849315068493151
      col_name  dtype        kinds  null   空值率  nunique
58  GarageCars  int64  categorical     0  0.0%        5
num index: * 58 * [[4, 0, 3, 1, 2]]

quartile  非重复总数:5    异常值个数:5    比例:0.003424657534246575 !!!!!!!!!! 异常值过多,超过非重复总数

      col_name  dtype    kinds  null   空值率  nunique
59  GarageArea  int64  numeric     0  0.0%      441
num index: * 59 * [[1418, 309, 318, 261, 319, 983, 254, 1356]]

quartile  非重复总数:441  异常值个数:21   比例:0.014383561643835616

+/-3 std  非重复总数:441  异常值个数:7    比例:0.004794520547945206
      col_name  dtype    kinds  null   空值率  nunique
63  WoodDeckSF  int64  numeric     0  0.0%      274
num index: * 63 * [[35, 416, 414, 409, 406, 404, 403, 402]]

quartile  非重复总数:274  异常值个数:32   比例:0.021917808219178082

+/-3 std  非重复总数:274  异常值个数:22   比例:0.015068493150684932
       col_name  dtype    kinds  null   空值率  nunique
64  OpenPorchSF  int64  numeric     0  0.0%      202
num index: * 64 * [[119, 235, 231, 229, 224, 214, 213, 210]]

quartile  非重复总数:202  异常值个数:77   比例:0.05273972602739726

+/-3 std  非重复总数:202  异常值个数:27   比例:0.018493150684931507
         col_name  dtype    kinds  null   空值率  nunique
65  EnclosedPorch  int64  numeric     0  0.0%      120
num index: * 65 * [[37, 226, 220, 218, 286, 214, 212, 210]]

quartile  非重复总数:120  异常值个数:208  比例:0.14246575342465753 !!!!!!!!!! 异常值过多,超过非重复总数

+/-3 std  非重复总数:120  异常值个数:51   比例:0.03493150684931507
     col_name  dtype    kinds  null   空值率  nunique
66  3SsnPorch  int64  numeric     0  0.0%       20
num index: * 66 * [[182, 96, 130, 140, 320, 153, 162, 304]]

quartile  非重复总数:20   异常值个数:24   比例:0.01643835616438356 !!!!!!!!!! 异常值过多,超过非重复总数

+/-3 std  非重复总数:20   异常值个数:23   比例:0.015753424657534248 !!!!!!!!!! 异常值过多,超过非重复总数
       col_name  dtype    kinds  null   空值率  nunique
67  ScreenPorch  int64  numeric     0  0.0%       76
num index: * 67 * [[99, 265, 266, 271, 273, 276, 287, 291]]

quartile  非重复总数:76   异常值个数:116  比例:0.07945205479452055 !!!!!!!!!! 异常值过多,超过非重复总数

+/-3 std  非重复总数:76   异常值个数:55   比例:0.03767123287671233
    col_name  dtype        kinds  null   空值率  nunique
68  PoolArea  int64  categorical     0  0.0%        8
num index: * 68 * [[738, 648, 576, 555, 519, 512, 480, 0]]

quartile  非重复总数:8    异常值个数:7    比例:0.004794520547945206

+/-3 std  非重复总数:8    异常值个数:7    比例:0.004794520547945206
   col_name  dtype    kinds  null   空值率  nunique
69  MiscVal  int64  numeric     0  0.0%       21
num index: * 69 * [[800, 350, 1400, 1300, 2500, 560, 3500, 620]]

quartile  非重复总数:21   异常值个数:52   比例:0.03561643835616438 !!!!!!!!!! 异常值过多,超过非重复总数

+/-3 std  非重复总数:21   异常值个数:8    比例:0.005479452054794521
   col_name  dtype    kinds  null   空值率  nunique
70   MoSold  int64  numeric     0  0.0%       12
num index: * 70 * [[2, 1, 12, 9, 11, 10, 3, 8]]


   col_name  dtype        kinds  null   空值率  nunique
71   YrSold  int64  categorical     0  0.0%        5
num index: * 71 * [[2010, 2008, 2006, 2007, 2009]]


Out[238]:
count mean std min 25% 50% 75% max IQR- IQR+ std-3 std+3
MSSubClass 1460.0 56.897260 42.300571 20.0 20.00 50.0 70.00 190.0 -55.00 145.00 -70.00 183.80
LotFrontage 1201.0 70.049958 24.284752 21.0 59.00 69.0 80.00 313.0 27.50 111.50 -2.80 142.90
LotArea 1460.0 10516.828082 9981.264932 1300.0 7553.50 9478.5 11601.50 215245.0 1481.50 17673.50 -19426.97 40460.62
OverallQual 1460.0 6.099315 1.382997 1.0 5.00 6.0 7.00 10.0 2.00 10.00 1.95 10.25
OverallCond 1460.0 5.575342 1.112799 1.0 5.00 5.0 6.00 9.0 3.50 7.50 2.24 8.91
YearBuilt 1460.0 1971.267808 30.202904 1872.0 1954.00 1973.0 2000.00 2010.0 1885.00 2069.00 1880.66 2061.88
YearRemodAdd 1460.0 1984.865753 20.645407 1950.0 1967.00 1994.0 2004.00 2010.0 1911.50 2059.50 1922.93 2046.80
MasVnrArea 1452.0 103.685262 181.066207 0.0 0.00 0.0 166.00 1600.0 -249.00 415.00 -439.51 646.88
BsmtFinSF1 1460.0 443.639726 456.098091 0.0 0.00 383.5 712.25 5644.0 -1068.38 1780.62 -924.65 1811.93
BsmtFinSF2 1460.0 46.549315 161.319273 0.0 0.00 0.0 0.00 1474.0 0.00 0.00 -437.41 530.51
BsmtUnfSF 1460.0 567.240411 441.866955 0.0 223.00 477.5 808.00 2336.0 -654.50 1685.50 -758.36 1892.84
TotalBsmtSF 1460.0 1057.429452 438.705324 0.0 795.75 991.5 1298.25 6110.0 42.00 2052.00 -258.69 2373.55
1stFlrSF 1460.0 1162.626712 386.587738 334.0 882.00 1087.0 1391.25 4692.0 118.12 2155.12 2.86 2322.39
2ndFlrSF 1460.0 346.992466 436.528436 0.0 0.00 0.0 728.00 2065.0 -1092.00 1820.00 -962.59 1656.58
LowQualFinSF 1460.0 5.844521 48.623081 0.0 0.00 0.0 0.00 572.0 0.00 0.00 -140.02 151.71
GrLivArea 1460.0 1515.463699 525.480383 334.0 1129.50 1464.0 1776.75 5642.0 158.62 2747.62 -60.98 3091.90
BsmtFullBath 1460.0 0.425342 0.518911 0.0 0.00 0.0 1.00 3.0 -1.50 2.50 -1.13 1.98
BsmtHalfBath 1460.0 0.057534 0.238753 0.0 0.00 0.0 0.00 2.0 0.00 0.00 -0.66 0.77
FullBath 1460.0 1.565068 0.550916 0.0 1.00 2.0 2.00 3.0 -0.50 3.50 -0.09 3.22
HalfBath 1460.0 0.382877 0.502885 0.0 0.00 0.0 1.00 2.0 -1.50 2.50 -1.13 1.89
BedroomAbvGr 1460.0 2.866438 0.815778 0.0 2.00 3.0 3.00 8.0 0.50 4.50 0.42 5.31
KitchenAbvGr 1460.0 1.046575 0.220338 0.0 1.00 1.0 1.00 3.0 1.00 1.00 0.39 1.71
TotRmsAbvGrd 1460.0 6.517808 1.625393 2.0 5.00 6.0 7.00 14.0 2.00 10.00 1.64 11.39
Fireplaces 1460.0 0.613014 0.644666 0.0 0.00 1.0 1.00 3.0 -1.50 2.50 -1.32 2.55
GarageYrBlt 1379.0 1978.506164 24.689725 1900.0 1961.00 1980.0 2002.00 2010.0 1899.50 2063.50 1904.44 2052.58
GarageCars 1460.0 1.767123 0.747315 0.0 1.00 2.0 2.00 4.0 -0.50 3.50 -0.47 4.01
GarageArea 1460.0 472.980137 213.804841 0.0 334.50 480.0 576.00 1418.0 -27.75 938.25 -168.43 1114.39
WoodDeckSF 1460.0 94.244521 125.338794 0.0 0.00 0.0 168.00 857.0 -252.00 420.00 -281.77 470.26
OpenPorchSF 1460.0 46.660274 66.256028 0.0 0.00 25.0 68.00 547.0 -102.00 170.00 -152.11 245.43
EnclosedPorch 1460.0 21.954110 61.119149 0.0 0.00 0.0 0.00 552.0 0.00 0.00 -161.40 205.31
3SsnPorch 1460.0 3.409589 29.317331 0.0 0.00 0.0 0.00 508.0 0.00 0.00 -84.54 91.36
ScreenPorch 1460.0 15.060959 55.757415 0.0 0.00 0.0 0.00 480.0 0.00 0.00 -152.21 182.33
PoolArea 1460.0 2.758904 40.177307 0.0 0.00 0.0 0.00 738.0 0.00 0.00 -117.77 123.29
MiscVal 1460.0 43.489041 496.123024 0.0 0.00 0.0 0.00 15500.0 0.00 0.00 -1444.88 1531.86
MoSold 1460.0 6.321918 2.703626 1.0 5.00 6.0 8.00 12.0 0.50 12.50 -1.79 14.43
YrSold 1460.0 2007.815753 1.328095 2006.0 2007.00 2008.0 2009.00 2010.0 2004.00 2012.00 2003.83 2011.80

-----填充

In [239]:
# df = fix_outlier(dfc_num,how='quartile').fit_transform(df) #dfc_num注意使用,有时候不需要全部数值列都处理
# df = fix_outlier([]).fit_transform(df) #自定义列表
# df = fix_outlier().fit_transform(df)

再确认

In [240]:
# abnormal_value(df,dfc_num,dqrs)

变量分析

-----dfc_cycle

In [241]:
dfc_cycle_index=-1
# dfc_cycle_list=dfc_num+dfc_str
dfc_cycle_list=dfc_str+dfc_num

def dfc_cycle (dfc_cycle_list:list,df:pd.DataFrame,dqrs:pd.DataFrame):
    global dfc_cycle_index
    dfc_cycle_index+=1
    if dfc_cycle_index < len(dfc_cycle_list):
        if ytype=='fenlei':
            if df.dtypes[dfc_cycle_list[dfc_cycle_index]]=='object':
                print(dfc_str_plt_fenlei_huigui(dfc_cycle_list[dfc_cycle_index],df,label_col_name='Y')) #fenlei_huigui一样, label_col不一样
            else:
                print(abnormal_value(df,[dfc_cycle_list[dfc_cycle_index]],dqrs))
                dfc_num_plt_fenlei(dfc_cycle_list[dfc_cycle_index],df) #分类
        else:
            if df.dtypes[dfc_cycle_list[dfc_cycle_index]]=='object':
                print(dfc_str_plt_fenlei_huigui(dfc_cycle_list[dfc_cycle_index],df,label_col_name='Ycut'))#fenlei_huigui一样, label_col不一样
            else:
                print(abnormal_value(df,[dfc_cycle_list[dfc_cycle_index]],dqrs))
                dfc_num_plt_huigui(dfc_cycle_list[dfc_cycle_index],df) #回归
    else:
        print('dfc_cycle complete. dfc_cycle_index:',dfc_cycle_index)
        dfc_cycle_index=0  

# dfc_cycle_list
In [242]:
dfc_cycle(dfc_cycle_list,df,dqrs)
p_value:  6.002756817064552e-59
Ycut      (129975.0, 163000.0]  (163000.0, 214000.0]  (214000.0, 755000.0]  \
MSZoning                                                                     
C (all)                    1.0                   0.0                   0.0   
FV                         8.0                  29.0                  28.0   
RH                         8.0                   2.0                   0.0   
RL                       287.0                 326.0                 326.0   
RM                        63.0                   9.0                   8.0   

Ycut      (34899.999, 129975.0]  
MSZoning                         
C (all)                     9.0  
FV                          0.0  
RH                          6.0  
RL                        212.0  
RM                        138.0  
In [243]:
# for x in df.x3.unique():
#     s1=df[df.x3==x]['Y']
#     print(x)
#     print(s1.mean(),s1.std())

卡方 类别 vs 分类

In [244]:
if ytype=='fenlei':
    fea_imp_list_kf_str_fenlei=fea_imp_kf_str_fenlei(df)
else:
    # 回归
    fea_imp_list_kf_str_fenlei=fea_imp_kf_str_fenlei(df,'Ycut')
      categorical     Kf-Value     Kf-P-Value
7    Neighborhood  1214.306130  5.542657e-207
17      ExterQual   725.401084  2.388526e-150
29    KitchenQual   722.057891  1.250527e-149
20       BsmtQual   710.778271  3.330933e-147
32   GarageFinish   523.240570  8.267739e-110
19     Foundation   517.647182  1.041686e-100
31     GarageType   457.125511   6.461484e-88
26      HeatingQC   360.420597   8.861934e-70
23   BsmtFinType1   330.754544   2.202050e-61
0        MSZoning   309.013344   6.002757e-59
14    Exterior1st   377.526137   1.591460e-55
16     MasVnrType   279.114846   6.965017e-55
15    Exterior2nd   357.004485   3.681940e-50
11     HouseStyle   251.069721   2.504394e-41
27     CentralAir   151.916053   1.017235e-32
37  SaleCondition   176.610274   1.145873e-29
2        LotShape   147.863558   2.442677e-27
22   BsmtExposure   135.481942   8.820406e-25
36       SaleType   168.085478   1.346183e-23
35     PavedDrive   119.321379   2.262530e-23
28     Electrical   128.049873   1.519149e-21
33     GarageQual    85.661295   3.395918e-13
10       BldgType    85.312751   3.962879e-13
8      Condition1   103.305899   8.158016e-12
34     GarageCond    74.475449   4.615986e-11
21       BsmtCond    66.309917   8.018672e-11
12      RoofStyle    65.223095   3.122658e-08
3     LandContour    45.027748   9.118229e-07
25        Heating    48.338244   2.242542e-05
18      ExterCond    41.881454   3.486998e-05
5       LotConfig    40.182262   6.706733e-05
30     Functional    49.374472   9.379561e-05
13       RoofMatl    34.146776   3.494951e-02
************************************************** should be removed feature
     categorical   Kf-Value  Kf-P-Value
24  BsmtFinType2  24.561049    0.056158
6      LandSlope  11.054732    0.086700
1         Street   6.028478    0.110233
9     Condition2  29.074553    0.112223
4      Utilities   2.980243    0.394681

anova

In [245]:
if ytype=='fenlei':
    # 数值 vs 分类
    fea_imp_list_num_ttest_anova_fenlei=fea_imp_num_ttest_anova_fenlei(df,dfc_num,'Y',fig_size=(8,8))
    print(fea_imp_list_num_ttest_anova_fenlei)
else:
    # 类别 vs 回归
    fea_imp_list_str_ttest_anova_huigui=fea_imp_str_ttest_anova_huigui(df,dfc_str,fig_size=(8,8))
    print(fea_imp_list_str_ttest_anova_huigui)
                     7             17            29          19           26  \
feature    Neighborhood     ExterQual   KitchenQual  Foundation    HeatingQC   
type              anova         anova         anova       anova        anova   
levene            False         False         False       False        False   
n                    25             4             4           6            5   
pval        1.5586e-225  1.43955e-204  3.03221e-192  5.7919e-91  2.66706e-67   
disparity       517.638       469.363       440.987     207.779      153.292   

                      37           14           15           36           0   \
feature    SaleCondition  Exterior1st  Exterior2nd     SaleType     MSZoning   
type               anova        anova        anova        anova        anova   
levene             False        False        False        False        False   
n                      6           15           16            9            5   
pval         7.98827e-44  2.58609e-43  4.84219e-43  5.03977e-42  8.81763e-35   
disparity        99.2358       98.061      97.4338      95.0912      78.4137   

                    11           2            27           35           12  \
feature     HouseStyle     LotShape   CentralAir   PavedDrive    RoofStyle   
type             anova        anova       t_test        anova        anova   
levene           False        False         True        False        False   
n                    8            4            2            3            6   
pval       3.37678e-25  6.44752e-25  1.80951e-22  1.80357e-18  3.65352e-17   
disparity      56.3477      55.7009      50.0638      40.8568      37.8483   

                    10           3            13           8            18  \
feature       BldgType  LandContour     RoofMatl   Condition1    ExterCond   
type             anova        anova        anova        anova        anova   
levene           False        False        False        False         True   
n                    5            4            8            9            5   
pval       2.05674e-10  2.74222e-08  7.23144e-08  8.90455e-08  5.10668e-07   
disparity      22.3047      17.4119      16.4422      16.2341      14.4875   

                    5           30           25          9         1   \
feature      LotConfig  Functional      Heating  Condition2    Street   
type             anova       anova        anova       anova    t_test   
levene            True       False         True        True     False   
n                    5           7            6           8         2   
pval       3.16317e-06  0.00048417  0.000753472   0.0434257  0.115048   
disparity      12.6639     7.63308      7.19082      3.1367   2.16241   

                  6          4           16        20        21            22  \
feature    LandSlope  Utilities  MasVnrType  BsmtQual  BsmtCond  BsmtExposure   
type           anova     t_test       anova     anova     anova         anova   
levene          True      False        True      True      True          True   
n                  3          2           4         4         4             4   
pval        0.141396        NaN         NaN       NaN       NaN           NaN   
disparity    1.95619        NaN         NaN       NaN       NaN           NaN   

                     23            24          28          31            32  \
feature    BsmtFinType1  BsmtFinType2  Electrical  GarageType  GarageFinish   
type              anova         anova       anova       anova         anova   
levene             True          True        True        True          True   
n                     6             6           5           6             3   
pval                NaN           NaN         NaN         NaN           NaN   
disparity           NaN           NaN         NaN         NaN           NaN   

                   33          34  
feature    GarageQual  GarageCond  
type            anova       anova  
levene           True        True  
n                   5           5  
pval              NaN         NaN  
disparity         NaN         NaN  

皮尔逊系数 数值 vs 回归

In [246]:
if ytype=='fenlei':
    dfcorr=dfc_plt_corr(df,fig_size=(10,10),label_col_name=dfc_num[0])
else:
    dfcorr=dfc_plt_corr(df,fig_size=(16,16))
In [247]:
dfcorr_largest(dfcorr,0.7)
Out[247]:
[('OverallQual', 'Y', '0.79'),
 ('YearBuilt', 'GarageYrBlt', '0.83'),
 ('TotalBsmtSF', '1stFlrSF', '0.82'),
 ('1stFlrSF', 'TotalBsmtSF', '0.82'),
 ('GrLivArea', 'TotRmsAbvGrd', '0.83'),
 ('GrLivArea', 'Y', '0.71'),
 ('TotRmsAbvGrd', 'GrLivArea', '0.83'),
 ('GarageYrBlt', 'YearBuilt', '0.83'),
 ('GarageCars', 'GarageArea', '0.88'),
 ('GarageArea', 'GarageCars', '0.88'),
 ('Y', 'OverallQual', '0.79'),
 ('Y', 'GrLivArea', '0.71')]
In [248]:
# s=dfcorr[dfcorr>0.5].Y.dropna()
# s.drop(index='Y')
# s1=dfcorr[dfcorr>0.5].MSSubClass.dropna()
# s1=pd.DataFrame(s1.drop(index='MSSubClass'))
# s1

斯皮尔曼系数 数值 vs 回归

In [249]:
def spearman(frame, features,label_col_name='Y'):
    spr = pd.DataFrame()
    spr['feature'] = features
    #Signature: a.corr(other, method='pearson', min_periods=None)
    #Docstring:
    #Compute correlation with `other` Series, excluding missing values
    # 计算特征和 SalePrice的 斯皮尔曼 相关系数
    spr['spearman'] = [frame[f].corr(frame[label_col_name], 'spearman') for f in features]
    spr = spr.sort_values('spearman')
    plt.figure(figsize=(6, 0.25*len(features))) # width, height
    sns.barplot(data=spr, y='feature', x='spearman', orient='h')

spearman(df,dfc_num)

线性关系可视化 数值 vs 回归

In [250]:
dfc_num_plt_huigui_gmap_pairplot(df,dfc_num)

report 最终

In [251]:
DQReport(data=df, target='Y').SReport()
Out[251]:
col_name dtype kinds null 空值率 nunique unique_value
37 Heating object categorical 0 0.0% 6 [Floor, OthW, Wall, Grav, GasW, GasA]
30 BsmtExposure object categorical 38 2.6% 4 [Mn, Gd, Av, No]
31 BsmtFinType1 object categorical 37 2.5% 6 [LwQ, Rec, BLQ, ALQ, GLQ, Unf]
33 BsmtFinType2 object categorical 38 2.6% 6 [GLQ, ALQ, BLQ, LwQ, Rec, Unf]
38 HeatingQC object categorical 0 0.0% 5 [Po, Fa, Gd, TA, Ex]
... ... ... ... ... ... ... ...
34 BsmtFinSF2 int64 numeric 0 0.0% 144 [169, 692, 713, 764, 352, 768, 791, 690]
32 BsmtFinSF1 int64 numeric 0 0.0% 637 [404, 222, 223, 224, 1441, 228, 1440, 1436]
24 MasVnrArea float64 numeric 8 0.5% 327 [67.0, 506.0, 459.0, 748.0, 603.0, 46.0, 922.0...
56 GarageYrBlt float64 numeric 81 5.5% 97 [1906.0, 1908.0, 1900.0, 1933.0, 1927.0, 1942....
0 MSSubClass int64 numeric 0 0.0% 15 [40, 180, 45, 75, 85, 190, 90, 80]

76 rows × 7 columns

In [252]:
report_view(dqrs)
dfc_str,dfc_num,dfc_num_int,dfc_num_float=dfc_type(df,if_print=True)
str maybe num,  dtype: str, kind: num,  0
[]

str nunique>=8,: 8
['SaleType', 'Neighborhood', 'Condition1', 'Condition2', 'HouseStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd']

num maybe str,  dtype: num, kind: str,  10
['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces', 'GarageCars', 'PoolArea', 'YrSold']

num nunique<=8,: 10
['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces', 'GarageCars', 'PoolArea', 'YrSold']

num nunique>8,<=20,: 6
['3SsnPorch', 'MoSold', 'TotRmsAbvGrd', 'OverallQual', 'OverallCond', 'MSSubClass']
****************************** str, 38
['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']
****************************** int, 33
['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
****************************** float, 3
['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

dftm

In [308]:
dftm=df.copy()
try:
    dftm=dftm.drop(['Ycut'],axis=1)
except:
    pass
dftm.head()
Out[308]:
MSSubClass MSZoning LotFrontage LotArea Street LotShape LandContour Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal MoSold YrSold SaleType SaleCondition Y
0 60 RL 65.0 8450 Pave Reg Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2003 2003 Gable CompShg VinylSd VinylSd BrkFace 196.0 Gd TA PConc Gd TA No GLQ 706 Unf 0 150 856 GasA Ex Y SBrkr 856 854 0 1710 1 0 2 1 3 1 Gd 8 Typ 0 Attchd 2003.0 RFn 2 548 TA TA Y 0 61 0 0 0 0 0 2 2008 WD Normal 208500.0
1 20 RL 80.0 9600 Pave Reg Lvl AllPub FR2 Gtl Veenker Feedr Norm 1Fam 1Story 6 8 1976 1976 Gable CompShg MetalSd MetalSd None 0.0 TA TA CBlock Gd TA Gd ALQ 978 Unf 0 284 1262 GasA Ex Y SBrkr 1262 0 0 1262 0 1 2 0 3 1 TA 6 Typ 1 Attchd 1976.0 RFn 2 460 TA TA Y 298 0 0 0 0 0 0 5 2007 WD Normal 181500.0
2 60 RL 68.0 11250 Pave IR1 Lvl AllPub Inside Gtl CollgCr Norm Norm 1Fam 2Story 7 5 2001 2002 Gable CompShg VinylSd VinylSd BrkFace 162.0 Gd TA PConc Gd TA Mn GLQ 486 Unf 0 434 920 GasA Ex Y SBrkr 920 866 0 1786 1 0 2 1 3 1 Gd 6 Typ 1 Attchd 2001.0 RFn 2 608 TA TA Y 0 42 0 0 0 0 0 9 2008 WD Normal 223500.0
3 70 RL 60.0 9550 Pave IR1 Lvl AllPub Corner Gtl Crawfor Norm Norm 1Fam 2Story 7 5 1915 1970 Gable CompShg Wd Sdng Wd Shng None 0.0 TA TA BrkTil TA Gd No ALQ 216 Unf 0 540 756 GasA Gd Y SBrkr 961 756 0 1717 1 0 1 0 3 1 Gd 7 Typ 1 Detchd 1998.0 Unf 3 642 TA TA Y 0 35 272 0 0 0 0 2 2006 WD Abnorml 140000.0
4 60 RL 84.0 14260 Pave IR1 Lvl AllPub FR2 Gtl NoRidge Norm Norm 1Fam 2Story 8 5 2000 2000 Gable CompShg VinylSd VinylSd BrkFace 350.0 Gd TA PConc Gd TA Av GLQ 655 Unf 0 490 1145 GasA Ex Y SBrkr 1145 1053 0 2198 1 0 2 1 4 1 Gd 9 Typ 1 Attchd 2000.0 RFn 3 836 TA TA Y 192 84 0 0 0 0 0 12 2008 WD Normal 250000.0

xgb填空

In [309]:
dftm=xgb_fill().fit_transform(dftm,dftm.Y) #无Y列出错
  0%|                                                                                           | 0/38 [00:00<?, ?it/s]
categorical:    MSZoning
categorical:    Street
categorical:    LotShape
categorical:    LandContour
categorical:    Utilities
categorical:    LotConfig
categorical:    LandSlope
categorical:    Neighborhood
categorical:    Condition1
categorical:    Condition2
categorical:    BldgType
categorical:    HouseStyle
categorical:    RoofStyle
categorical:    RoofMatl
categorical:    Exterior1st
categorical:    Exterior2nd
categorical:    MasVnrType
****************************************************************************************************
 45%|████████████████████████████████████▋                                             | 17/38 [00:01<00:02,  9.42it/s]
categorical:    ExterQual
categorical:    ExterCond
categorical:    Foundation
categorical:    BsmtQual
****************************************************************************************************
 55%|█████████████████████████████████████████████▎                                    | 21/38 [00:05<00:05,  2.89it/s]
categorical:    BsmtCond
****************************************************************************************************
 58%|███████████████████████████████████████████████▍                                  | 22/38 [00:08<00:19,  1.19s/it]
categorical:    BsmtExposure
****************************************************************************************************
 61%|█████████████████████████████████████████████████▋                                | 23/38 [00:12<00:29,  2.00s/it]
categorical:    BsmtFinType1
****************************************************************************************************
 63%|███████████████████████████████████████████████████▊                              | 24/38 [00:17<00:41,  2.99s/it]
categorical:    BsmtFinType2
****************************************************************************************************
 66%|█████████████████████████████████████████████████████▉                            | 25/38 [00:21<00:40,  3.12s/it]
categorical:    Heating
categorical:    HeatingQC
categorical:    CentralAir
categorical:    Electrical
****************************************************************************************************
 76%|██████████████████████████████████████████████████████████████▌                   | 29/38 [00:24<00:21,  2.42s/it]
categorical:    KitchenQual
categorical:    Functional
categorical:    GarageType
****************************************************************************************************
 84%|█████████████████████████████████████████████████████████████████████             | 32/38 [00:28<00:12,  2.12s/it]
categorical:    GarageFinish
****************************************************************************************************
 87%|███████████████████████████████████████████████████████████████████████▏          | 33/38 [00:31<00:11,  2.38s/it]
categorical:    GarageQual
****************************************************************************************************
 89%|█████████████████████████████████████████████████████████████████████████▎        | 34/38 [00:34<00:09,  2.44s/it]
categorical:    GarageCond
****************************************************************************************************
100%|██████████████████████████████████████████████████████████████████████████████████| 38/38 [00:36<00:00,  1.03it/s]
  0%|                                                                                           | 0/36 [00:00<?, ?it/s]
categorical:    PavedDrive
categorical:    SaleType
categorical:    SaleCondition
numerical:      MSSubClass
numerical:      LotFrontage
****************************************************************************************************
  6%|████▌                                                                              | 2/36 [00:00<00:14,  2.27it/s]
numerical:      LotArea
numerical:      OverallQual
numerical:      OverallCond
numerical:      YearBuilt
numerical:      YearRemodAdd
numerical:      MasVnrArea
****************************************************************************************************
 22%|██████████████████▍                                                                | 8/36 [00:01<00:09,  2.80it/s]
numerical:      BsmtFinSF1
numerical:      BsmtFinSF2
numerical:      BsmtUnfSF
numerical:      TotalBsmtSF
numerical:      1stFlrSF
numerical:      2ndFlrSF
numerical:      LowQualFinSF
numerical:      GrLivArea
numerical:      BsmtFullBath
numerical:      BsmtHalfBath
numerical:      FullBath
numerical:      HalfBath
numerical:      BedroomAbvGr
numerical:      KitchenAbvGr
numerical:      TotRmsAbvGrd
numerical:      Fireplaces
numerical:      GarageYrBlt
****************************************************************************************************
100%|██████████████████████████████████████████████████████████████████████████████████| 36/36 [00:03<00:00, 11.64it/s]
 45%|████████████████████████████████████▏                                            | 17/38 [00:00<00:00, 118.31it/s]
numerical:      GarageCars
numerical:      GarageArea
numerical:      WoodDeckSF
numerical:      OpenPorchSF
numerical:      EnclosedPorch
numerical:      3SsnPorch
numerical:      ScreenPorch
numerical:      PoolArea
numerical:      MiscVal
numerical:      MoSold
numerical:      YrSold
100%|██████████████████████████████████████████████████████████████████████████████████| 38/38 [00:01<00:00, 23.03it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 36/36 [00:00<00:00, 70.42it/s]

null列 dftm查看

In [310]:
# dfc_null(dftm,0.001)
# dftm.isnull().sum().max()
dftm.isnull().sum().sum()
Out[310]:
0

-----str列处理, 空值填充

In [311]:
# dfc_type(dftm,True,False)[0]
In [312]:
# dftm['Y']=dftm['Y'].astype(int)
# dftm['Y'].value_counts()
# df['Y'] = np.where(df['Y'] == 'YES', 1, 0) #如果Y列不是0,1,2类型则记得转换

dftm['Embarked'].fillna(dftm['Embarked'].mode()[0],inplace=True)

DQReport(data=dftm, target='Y').SReport()

report dftm

In [313]:
dqrs=DQReport(data=dftm, target='Y').SReport()
In [314]:
report_view(dqrs)
dfc_str,dfc_num,dfc_num_int,dfc_num_float=dfc_type(df,if_print=True)
str maybe num,  dtype: str, kind: num,  0
[]

str nunique>=8,: 8
['SaleType', 'Neighborhood', 'Condition1', 'Condition2', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'HouseStyle']

num maybe str,  dtype: num, kind: str,  10
['Fireplaces', 'GarageCars', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'YrSold', 'PoolArea']

num nunique<=8,: 10
['Fireplaces', 'GarageCars', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'YrSold', 'PoolArea']

num nunique>8,<=20,: 6
['MoSold', '3SsnPorch', 'MSSubClass', 'TotRmsAbvGrd', 'OverallQual', 'OverallCond']
****************************** str, 38
['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']
****************************** int, 33
['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
****************************** float, 3
['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

dftm_fea

In [315]:
dftm_fea=dftm.loc[:,['Y']+['BsmtQual', 'WoodDeckSF', 'HeatingQC', 'Foundation', 'SaleCondition', 'BsmtExposure', 'MSSubClass', 'Condition1', '1stFlrSF', 'RoofStyle', 'BldgType', 'LotShape', 'TotalBsmtSF', 'YearRemodAdd', 'ExterQual', 'RoofMatl', 'MasVnrArea', 'OverallQual', 'Neighborhood', 'GarageArea', 'BsmtFinSF1', 'GarageYrBlt', 'SaleType', 'HouseStyle', 'LotArea', '2ndFlrSF', 'MSZoning', 'CentralAir', 'YearBuilt', 'BsmtFinType1', 'LotFrontage', 'YrSold', 'ExterCond', 'BsmtUnfSF', 'KitchenQual', 'OpenPorchSF', 'PavedDrive', 'Exterior1st', 'Exterior2nd', 'LotConfig', 'MoSold', 'LandContour', 'MasVnrType', 'GrLivArea', 'OverallCond', 'BsmtCond']]

get_dummies

In [316]:
dftm=df_get_dummies(dftm)
# dftm.head()

dftm_fea=df_get_dummies(dftm_fea)

----train test split

In [317]:
x_train=dftm.drop(['Y'],axis=1)
try:
    x_train=dftm.drop(['Ycut'],axis=1) # 回归前面有可能有Ycut
except:
    pass
y_train=dftm['Y']

x_train_fea=dftm_fea.drop(['Y'],axis=1)
try:
    x_train_fea=dftm_fea.drop(['Ycut'],axis=1) # 回归前面有可能有Ycut
except:
    pass
y_train_fea=dftm_fea['Y']

特征重要性 xgb tm_plt

In [318]:
# 分类
# tm=xgb.XGBClassifier().fit(x_train, y_train)
# 回归
tm=xgb.XGBRegressor().fit(x_train, y_train)
In [319]:
tm_plt=tm_plt_importance(tm)
In [320]:
fea_imp_list_xgb=tm_plt_importance_list(tm_plt)
**************************************** important **************************************** 50
['LotFrontage', 'LotArea', 'MSSubClass', 'BsmtUnfSF', 'GrLivArea', 'GarageArea', 'TotalBsmtSF', 'BsmtFinSF1', '1stFlrSF', 'YearBuilt', 'MasVnrArea', 'OpenPorchSF', 'MoSold', 'YearRemodAdd', 'OverallQual', 'OverallCond', '2ndFlrSF', 'WoodDeckSF', 'GarageYrBlt', 'YrSold', 'TotRmsAbvGrd', 'EnclosedPorch', 'Fireplaces', 'ScreenPorch', 'BsmtFullBath', 'GarageCars', 'BedroomAbvGr', 'BsmtFinSF2', 'BsmtExposure', 'KitchenQual', 'GarageFinish', 'BsmtFinType1', 'SaleCondition', '3SsnPorch', 'LotShape', 'Exterior2nd', 'Exterior1st', 'RoofStyle', 'HeatingQC', 'MSZoning', 'SaleType', 'LotConfig', 'GarageType', 'MiscVal', 'MasVnrType', 'HalfBath', 'Neighborhood', 'Condition1', 'BsmtFinType2', 'BsmtHalfBath']
[63.0, 62.0, 61.0, 60.0, 59.0, 58.0, 57.0, 56.0, 55.0, 54.0, 53.0, 52.0, 51.0, 50.0, 49.0, 48.0, 47.0, 46.0, 45.0, 44.0, 43.0, 42.0, 41.0, 40.0, 39.0, 37.0, 34.0, 32.0, 31.5, 29.0, 29.0, 27.0, 26.0, 25.0, 22.5, 22.0, 20.5, 17.0, 16.666666666666668, 16.0, 15.5, 15.5, 14.0, 12.0, 10.0, 9.0, 5.5, 5.0, 4.0, 2.0]
*********************************** unimportant feature *********************************** 25
{'Heating', 'FullBath', 'Street', 'ExterQual', 'RoofMatl', 'Ycut', 'Utilities', 'HouseStyle', 'CentralAir', 'ExterCond', 'GarageCond', 'LandContour', 'KitchenAbvGr', 'Functional', 'GarageQual', 'BsmtQual', 'Condition2', 'LowQualFinSF', 'Foundation', 'BldgType', 'PoolArea', 'Electrical', 'PavedDrive', 'LandSlope', 'BsmtCond'}

-----特征最终筛选

In [321]:
top=20
s1=fea_imp_list_xgb[:top]
s2=fea_imp_list_kf_str_fenlei[0][:top]
if ytype=='fenlei':
    s3=list(fea_imp_list_num_ttest_anova_fenlei.loc['feature',:])
else:
    s3=list(fea_imp_list_str_ttest_anova_huigui.loc['feature',:])
s3=s3[:top]
In [322]:
print('xgb: ',len(s1))
print(s1)

print('')
if ytype=='fenlei':
    print('kf, str: ',len(s2))
else:
    print('kf, str. !!!回归的Ycut仅供参考: ',len(s2)) 
print(s2)

print('')
if ytype=='fenlei':
    print('anova, num. 用于分类: ',len(s3))
    print(s3)
else:
    print('anova, str. 用于回归:', len(s3))
    print(s3)
    
print('')
dfc_str,dfc_num,dfc_num_int,dfc_num_float=dfc_type(df,if_print=True)
# print('重要的str')
# print(list(set(tpil_top10).intersection(dfc_str)))
# print('不重要的str')
# print(list(set(dfc_str).difference(tpil)))
# print('重要的num')
# print(list(set(tpil_top10).intersection(dfc_num)))
# print('不重要的num')
# print(list(set(dfc_num).difference(tpil)))
xgb:  20
['LotFrontage', 'LotArea', 'MSSubClass', 'BsmtUnfSF', 'GrLivArea', 'GarageArea', 'TotalBsmtSF', 'BsmtFinSF1', '1stFlrSF', 'YearBuilt', 'MasVnrArea', 'OpenPorchSF', 'MoSold', 'YearRemodAdd', 'OverallQual', 'OverallCond', '2ndFlrSF', 'WoodDeckSF', 'GarageYrBlt', 'YrSold']

kf, str. !!!回归的Ycut仅供参考:  20
['MSZoning', 'LotShape', 'LandContour', 'LotConfig', 'Neighborhood', 'Condition1', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1']

anova, str. 用于回归: 20
['Neighborhood', 'ExterQual', 'KitchenQual', 'Foundation', 'HeatingQC', 'SaleCondition', 'Exterior1st', 'Exterior2nd', 'SaleType', 'MSZoning', 'HouseStyle', 'LotShape', 'CentralAir', 'PavedDrive', 'RoofStyle', 'BldgType', 'LandContour', 'RoofMatl', 'Condition1', 'ExterCond']

****************************** str, 38
['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']
****************************** int, 33
['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
****************************** float, 3
['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
In [323]:
# xgb &卡方
s4=set(s1).intersection(s2)
print(s4)
set()
In [324]:
# xgb & anova
s5=set(s1).intersection(s3)
print(s5)
set()
In [325]:
# xgb & dfc_num
s6=set(s1).intersection(dfc_num)
print(len(s6))
print(s6)
20
{'WoodDeckSF', 'MSSubClass', '1stFlrSF', 'TotalBsmtSF', 'YearRemodAdd', 'MasVnrArea', 'OverallQual', 'GarageArea', 'BsmtFinSF1', 'GarageYrBlt', 'LotArea', '2ndFlrSF', 'YearBuilt', 'LotFrontage', 'YrSold', 'BsmtUnfSF', 'OpenPorchSF', 'MoSold', 'GrLivArea', 'OverallCond'}
In [326]:
s5.difference(s4)
Out[326]:
set()
In [327]:
set(s2).difference(s3)
Out[327]:
{'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtQual',
 'LotConfig',
 'MasVnrType'}
In [328]:
set(s3).difference(s2)
Out[328]:
{'CentralAir',
 'HeatingQC',
 'KitchenQual',
 'PavedDrive',
 'SaleCondition',
 'SaleType'}
In [329]:
set(s2).intersection(s3)
Out[329]:
{'BldgType',
 'Condition1',
 'ExterCond',
 'ExterQual',
 'Exterior1st',
 'Exterior2nd',
 'Foundation',
 'HouseStyle',
 'LandContour',
 'LotShape',
 'MSZoning',
 'Neighborhood',
 'RoofMatl',
 'RoofStyle'}
In [330]:
s=list(set(s1+s2+s3))
print(len(s))
print(s)
46
['BsmtQual', 'WoodDeckSF', 'HeatingQC', 'Foundation', 'SaleCondition', 'BsmtExposure', 'MSSubClass', 'Condition1', '1stFlrSF', 'RoofStyle', 'BldgType', 'LotShape', 'TotalBsmtSF', 'YearRemodAdd', 'ExterQual', 'RoofMatl', 'MasVnrArea', 'OverallQual', 'Neighborhood', 'GarageArea', 'BsmtFinSF1', 'GarageYrBlt', 'SaleType', 'HouseStyle', 'LotArea', '2ndFlrSF', 'MSZoning', 'CentralAir', 'YearBuilt', 'BsmtFinType1', 'LotFrontage', 'YrSold', 'ExterCond', 'BsmtUnfSF', 'KitchenQual', 'OpenPorchSF', 'PavedDrive', 'Exterior1st', 'Exterior2nd', 'LotConfig', 'MoSold', 'LandContour', 'MasVnrType', 'GrLivArea', 'OverallCond', 'BsmtCond']

有无筛选特征 模型对比

In [332]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_train,y_train,test_size=0.25,random_state=0)

x_train_fea,x_test_fea,y_train_fea,y_test_fea = train_test_split(x_train_fea,y_train_fea,test_size=0.25,random_state=0)
In [333]:
print(x_train.shape)
print(len(y_train))
print(x_test.shape)
print(len(y_test))

print(x_train_fea.shape)
print(len(y_train_fea))
print(x_test_fea.shape)
print(len(y_test_fea))
(1095, 270)
1095
(365, 270)
365
(1095, 196)
1095
(365, 196)
365
In [334]:
from sklearn.preprocessing import StandardScaler
#标准化,返回值为标准化后的数据
std_x = StandardScaler()
x_train = std_x.fit_transform(x_train)
x_test = std_x.transform(x_test)

x_train_fea = std_x.fit_transform(x_train_fea)
x_test_fea = std_x.transform(x_test_fea)
In [335]:
#线性回归
from sklearn import linear_model
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

线性回归_无筛选特征

In [353]:
#gridsearch
params={'alpha':[0.001,0.01,0.1,1,10,100]}#粗调节
tm = Ridge()
tm=GridSearchCV(tm,params,cv=10)
tm.fit(x_train,y_train)  
tm.best_params_
Out[353]:
{'alpha': 100}
In [354]:
tm_names=[]
tm_scores_train=[]
tm_scores_test=[]
def tm_result_list(tm_name,tm_model):
    tm_names.append(tm_name)
    tm_scores_train.append(tm_model.score(x_train,y_train))
    tm_scores_test.append(tm_model.score(x_test,y_test))
In [355]:
tm_result_list('线性回归_无筛选特征',tm)
In [356]:
pd.DataFrame({'model':tm_names,'train':tm_scores_train,'test':tm_scores_test})
Out[356]:
model train test
0 线性回归_无筛选特征 0.939252 0.67534

线性回归_有筛选特征

In [357]:
params={'alpha':[0.001,0.01,0.1,1,10,100]}#粗调节
tm_fea= Ridge()
tm_fea=GridSearchCV(tm_fea,params,cv=10)
tm_fea.fit(x_train_fea,y_train_fea)  
tm_fea.best_params_
Out[357]:
{'alpha': 100}
In [358]:
tm_names.append('线性回归_有筛选特征')
tm_scores_train.append(tm_fea.score(x_train_fea,y_train_fea))
tm_scores_test.append(tm_fea.score(x_test_fea,y_test_fea))
In [359]:
pd.DataFrame({'model':tm_names,'train':tm_scores_train,'test':tm_scores_test})
Out[359]:
model train test
0 线性回归_无筛选特征 0.939252 0.675340
1 线性回归_有筛选特征 0.911585 0.739162

xgboost_无筛选特征

In [360]:
tm = xgb.XGBRFRegressor()
params = {
        'n_estimators':range(100,200,50),
        'max_depth':range(2,15,4),
        'learning_rate':np.linspace(0.01,3,5),
        }
tm=GridSearchCV(tm,param_grid=params,cv=5)
tm.fit(x_train, y_train)
tm.best_params_
Out[360]:
{'learning_rate': 0.7575000000000001, 'max_depth': 14, 'n_estimators': 150}
In [361]:
tm_result_list('xgboost_无筛选特征',tm)
In [362]:
pd.DataFrame({'model':tm_names,'train':tm_scores_train,'test':tm_scores_test})
Out[362]:
model train test
0 线性回归_无筛选特征 0.939252 0.675340
1 线性回归_有筛选特征 0.911585 0.739162
2 xgboost_无筛选特征 0.616141 0.496851

xgboost_有筛选特征

In [363]:
tm_fea = xgb.XGBRFRegressor()
params = {
        'n_estimators':range(100,200,50),
        'max_depth':range(2,15,4),
        'learning_rate':np.linspace(0.01,3,5),
        }
tm_fea=GridSearchCV(tm_fea,param_grid=params,cv=5)
tm_fea.fit(x_train_fea, y_train_fea)
tm_fea.best_params_
Out[363]:
{'learning_rate': 0.7575000000000001, 'max_depth': 14, 'n_estimators': 150}
In [364]:
tm_names.append('xgboost_有筛选特征')
tm_scores_train.append(tm_fea.score(x_train_fea,y_train_fea))
tm_scores_test.append(tm_fea.score(x_test_fea,y_test_fea))

只是筛选掉无用特征,其他条件一样的情况下,模型泛化能力提升。
- 线性回归提升8%,效果理想(线性回归对模型特征较敏感)。
- Xgboost提升不明显(Xgboost的基分类器决策树对无用特征不敏感)

In [365]:
pd.DataFrame({'model':tm_names,'train':tm_scores_train,'test':tm_scores_test})
Out[365]:
model train test
0 线性回归_无筛选特征 0.939252 0.675340
1 线性回归_有筛选特征 0.911585 0.739162
2 xgboost_无筛选特征 0.616141 0.496851
3 xgboost_有筛选特征 0.616419 0.517230